#---------------------------------------------------------
# Are Intermediary Constraints Priced?
# Du, Hébert, Huber
# The Review of Financial Studies 2022
#---------------------------------------------------------

#---------------------------------------------------------
# OptionMetrics Dataset -
# (1) Clean data following Constantinides Jackwerth Savov RAPS 2013 
# (2) Form portfolio: 54 in CJS, 18 in HKM, here, collapse the 54 into 18 along different dimension than HKM
# NOTE: this file should run twice, once with lev_adj <- 1, and once with lev_adj <- 0.9
# Warning: this code is memory-intensive to run
#---------------------------------------------------------

# Sources:
#   options - OptionMetrics options price data: 108105 S&P 500 Index - SPX; European only; index only
#             variables: secid, date, symbol, symbol_flag, exdate, last_date, cp_flag, strike_price, best_bid, best_offer
#                        volume, open_interest, impl_volatility, delta, gamma, vega, theta, optionid
#   sp500 - CRSP
#   divyield - OptionMetrics Market Index Dividend Yield

#---------------------------------------------------------
# Set-up
#---------------------------------------------------------

if (exists('script_path')) {
  rm(list = setdiff(ls(), 'script_path'))
} else {
  args = commandArgs(trailingOnly = T)
  if (length(args) > 0) {
    script_path <- args[1]  
    rm(list = setdiff(ls(), 'script_path'))
  } else {
    rm(list = ls())
    script_path <- '~/Dropbox/ForwardArbitrage/CodeReplication/'
  } 
}

library(svMisc)
library(dplyr)
library(ggplot2)
library(derivmkts)
library(lubridate)
library(bizdays)
library(timeDate) #NYSE business day calendar, in lieu of QuantLib
library(mvtnorm)
library(data.table)
library(reshape2) #melt function used for tidyverse data frame
library(lazyeval) #interp function used

options = fread(paste0(script_path, 'DataSkeleton/AssetClasses/OptionMetric_07202021.csv'), stringsAsFactors = F) #read.csv is too slow
sp500 = read.csv(paste0(script_path, 'DataSkeleton/AssetClasses/SP500_index_07202021.csv'), stringsAsFactors = F)
divyield = read.csv(paste0(script_path, 'DataSkeleton/AssetClasses/SP500_div_yield_07202021.csv'), stringsAsFactors = F)

init_time <- proc.time()
start_date <- ymd('1996-01-04')
end_date <- ymd('2020-12-31')

lev_adj <- 1 #options are {0, .9, .95, 1}
if (!(lev_adj %in% c(0, .9, .95, 1))) {
  stop
}

# Implied volatility: Call option
implied_call <- function(data){
  
  # Set the tolerance
  .tol = .Machine$double.eps^0.5
  
  # Determine violations
  mid = (data$best_bid + data$best_offer)/2
  min_violation = mid <= data$sp500 * exp(-data$div_yield * data$maturity) - data$strike_price * exp(-data$riskfree * data$maturity)
  max_violation = mid > data$strike_price * exp(-data$div_yield * data$maturity)
  violation = (min_violation==T) | (max_violation==T)
  
  # Compute objective function
  f <- function(v, data, mid){
    return(bscall(data$sp500, data$strike_price, v, data$riskfree, data$maturity, data$div_yield)-mid)
  }
  
  # Function to search for zero
  srch_zero <- function(f, datapoint, midpoint){
    try(x <- uniroot(function(v) f(v,datapoint, midpoint), c(0.001, 1000), tol = .tol), silent = T)
    return(ifelse(exists('x'), x$root, NA))
  }
  
  # Apply to full vector
  results = rep(NA, length(violation))
  results[violation==F] = sapply(which(violation==F), function(i) srch_zero(f, data[i,], mid[i]))
  return(results)
}

# Implied volatility: Put option
implied_put <- function(data){
  
  # Extracting mid
  mid = (data$best_bid + data$best_offer)/2
  
  # Implied call price: put-call parity
  call_price = mid + data$sp500 * exp(-data$div_yield * data$maturity) - data$strike_price * exp(-data$riskfree * data$maturity)
  data$best_bid = call_price
  data$best_offer = call_price
  
  # Use call implied vol
  results = implied_call(data)
  return(results)
}

# Implied volatility: All
implied_vol <- function(data){
  
  # Divide data into puts and calls
  call_data = data[data$cp_flag=='C',]
  put_data = data[data$cp_flag=='P',]
  
  # Generate implied volatility vectors
  call_impvol = implied_call(call_data)
  put_impvol = implied_put(put_data)
  
  # Return correct vector
  results = rep(NA, NROW(data))
  results[data$cp_flag=='C']=call_impvol
  results[data$cp_flag=='P']=put_impvol
  return(results)
}

####################################
# PRELIMINARY MANIPULATIONS ########
####################################

# Set date
options[, date := ymd(date)]
options[, exdate := ymd(exdate)]
options[, last_date := ymd(last_date)]

# Slice dataset: start_date to end_date
options = options[date <= end_date & date >= start_date]

# Merging the S&P500 dataset by date
names(sp500) = c('date', 'sp500')
sp500$date <- ymd(sp500$date)
options = merge(options, sp500, by='date', all.x = T)

# Merging the SPX dividend yield by date
names(divyield) = c('code','date','div_yield')
divyield$date <- ymd(divyield$date)
divyield$div_yield = divyield$div_yield/100
divyield$code = NULL
options = merge(options, divyield, by='date', all.x = T)

# Creating some variables
# Moneyness
options$strike_price = options$strike_price/1000
options$moneyness = options$strike_price/options$sp500

# Maturity
options$maturity = as.numeric((options$exdate-options$date)/360)

# Time the process and report number of obs
proc.time()-init_time
print(paste0(deparse(substitute(options)),' nobs: ',NROW(options)))

######################################
# FILTERING (CJS, 2013) ##############
######################################

###########################
# LEVEL 1 FILTERS #########
###########################

#####################
# Identical filter ##
#####################

# Removing duplicates: Date, Type, Strike, Exp, Price
level_1 = options %>% distinct(date, cp_flag, strike_price, exdate, best_bid, .keep_all = T)
level_1 = as.data.frame(level_1)

# Report the number of obs
print(paste0(deparse(substitute(level_1)),' nobs: ',NROW(level_1)))

#####################
# Bid = 0 filter ####
#####################

# Removing bids equal to 0
level_1 = level_1 %>% filter(best_bid!=0)

# Report the number of obs
print(paste0(deparse(substitute(level_1)),' nobs: ',NROW(level_1)))

#####################
# Id. exc P filter ##
#####################

# Preparing data to compute the filter

# Creating price count per option
level_1 = level_1 %>% group_by(date, cp_flag, strike_price, exdate) %>% mutate(n_prices = n())
level_1 = as.data.frame(level_1)

# Removing observations where n_prices>1 and impl_volatility==NA
level_1 = subset(level_1, n_prices==1 | (n_prices>1 & is.na(impl_volatility)==F))

# Refreshing the price count per option
level_1 = level_1 %>% group_by(date, cp_flag, strike_price, exdate) %>% mutate(n_prices = n())
level_1 = as.data.frame(level_1)

# Generating Id. exc. Price groups: identical_group
level_1$identical_group = NA
identical_idx = which(level_1$n_prices>1)
group = group_indices_(level_1[identical_idx,], .dots=c('date','cp_flag','strike_price','exdate'))
level_1[identical_idx, 'identical_group'] = group

remove(identical_idx)

# Retrieving unique groups
identical_groups = unique(group)

remove(group)

# Creating search datasets
search_data = level_1 %>% arrange(moneyness) %>% select(moneyness, impl_volatility)
search_data = na.omit(search_data)

# Filtering: Looping through the identical groups
delete_idx = c()
for (grp in identical_groups){
  
  progress(which(identical_groups==grp),length(identical_groups))
  
  # Subsetting the data
  data_idx = which(level_1$identical_group==grp)
  tmp_data = level_1[data_idx,]
  
  # Getting indices in tmp_data where na_idx is NA
  na_idx = which(is.na(tmp_data$impl_volatility))
  
  # If imp_volatility not found, add to the delete_idx
  na_add = data_idx[which(is.na(tmp_data$impl_volatility))]
  delete_idx = c(delete_idx, na_add)
  
  # Update data_idx
  data_idx = data_idx[!(data_idx %in% na_add)]
  
  # If data_idx is not empty, select price to be kept
  if (length(data_idx) != 0){
    
    # Update tmp_data
    tmp_data = level_1[data_idx,]
    
    # Obtain moneyness (should be the same for all)
    KP = unique(tmp_data$moneyness)
    
    # Finding the average implied volatility of neighbors
    kp_idx = which.min(abs(search_data$moneyness - KP))
    start = max(kp_idx-500,0)
    stop = min(NROW(search_data),kp_idx+500)
    mu_neighbors = mean(search_data$impl_volatility[start:stop])
    
    # Assign the largest distance to na_add and update delete_idx
    na_add = data_idx[which.max(abs(tmp_data$moneyness - mu_neighbors))]
    delete_idx = c(delete_idx, na_add)
  }
}

# Applying the filters: Removing data in delete_idx
if (!is.null(delete_idx)) {
  level_1 = level_1[-delete_idx, ]
}

remove(search_data, tmp_data, grp, start, stop)
remove(identical_groups)
remove(data_idx, delete_idx)
remove(kp_idx, KP, mu_neighbors, na_idx)
remove(na_add)

# Report the number of obs
print(paste0(deparse(substitute(level_1)),' nobs: ',NROW(level_1)))

###########################
# LEVEL 2 FILTERS #########
###########################

# Removing useless columns
level_2 = level_1
level_2$n_prices = NULL
level_2$identical_group = NULL

#####################
# Days to maturity ##
#####################
level_2 = level_2 %>% filter(maturity >= 7/360, maturity <= 180/360)

# Report the number of obs
print(paste0(deparse(substitute(level_2)),' nobs: ',NROW(level_2)))

#####################
# Implied vol #######
#####################

# Removing implied volatilities but keeping NAs
level_2 = level_2 %>% filter(is.na(impl_volatility) | (impl_volatility>=0.05 & impl_volatility<=1.0))

# Report the number of obs
print(paste0(deparse(substitute(level_2)),' nobs: ',NROW(level_2)))

#####################
# Moneyness #########
#####################
level_2 = level_2 %>% filter(moneyness >= 0.8, moneyness<=1.2)

# Report the number of obs
print(paste0(deparse(substitute(level_2)),' nobs: ',NROW(level_2)))

#####################
# Implied Int. Rate #
#####################

# Splitting the data into call and put
call_data = level_2 %>% filter(cp_flag=='C') %>% select('date', 'exdate',
                                                        'strike_price', 'best_bid', 'best_offer', 'sp500', 'div_yield',
                                                        'maturity', 'moneyness')
put_data = level_2 %>% filter(cp_flag=='P') %>% select('date', 'exdate',
                                                       'strike_price', 'best_bid', 'best_offer', 'sp500', 'div_yield',
                                                       'maturity', 'moneyness')

# Changing the column names: Flags for call and put to merge
names(call_data) = c('date', 'exdate', 'strike_price', 'best_bid_C',
                     'best_offer_C', 'sp500_C', 'div_yield_C', 'maturity_C',
                     'moneyness_C')

names(put_data) = c('date', 'exdate', 'strike_price', 'best_bid_P',
                    'best_offer_P', 'sp500_P', 'div_yield_P', 'maturity_P',
                    'moneyness_P')

# Merging the data to create call-put pairs
callput_data = merge(call_data, put_data, by=c('date','exdate','strike_price'))
callput_data$sp500_P = NULL
callput_data$maturity_P = NULL
callput_data$div_yield_P = NULL
callput_data$moneyness_P = NULL
names(callput_data) = c('date', 'exdate', 'strike_price', 'best_bid_C',
                        'best_offer_C', 'sp500', 'div_yield', 'maturity',
                        'moneyness', 'best_bid_P', 'best_offer_P')

remove(call_data, put_data)

# Computing the implied rate
callput_data$mid_C = (callput_data$best_bid_C+callput_data$best_offer_C)/2
callput_data$mid_P = (callput_data$best_bid_P+callput_data$best_offer_P)/2
callput_data = callput_data %>% mutate(rate = -log((sp500*exp(-div_yield*maturity) +
                                                      mid_P - mid_C)/strike_price)*(1/maturity))

# Reporting the negative pairs
print(paste0('Negative rate pairs: ', sum(callput_data$rate<0)))

# Merging the implied rate to level_2
callput_data = callput_data %>% select('date','exdate', 'strike_price', 'rate')
level_2 = merge(level_2, callput_data, by=c('date', 'exdate', 'strike_price'), all.x = T)

# Removing negative implied interest rates: Keep NAs (They still have a rate, just no Put-Call pair)
level_2 = level_2 %>% filter(is.na(rate) | rate >= 0)

# Report the number of obs
print(paste0(deparse(substitute(level_2)),' nobs (before interpolating): ',NROW(level_2)))

# Extracting the data to compute the median
callput_data = level_2 %>% filter(moneyness >=0.95, moneyness<=1.05)

# Computing the median
callput_data = callput_data %>% group_by(date,exdate) %>% summarize(rate_med=median(rate,na.rm = T))

# Interpolating curve per day

# Re-ordering the data by date then exdate
callput_data = callput_data[order(callput_data$date, callput_data$exdate),]
callput_data = as.data.frame(callput_data)

rate_med_fill = c()
interpolation = NA
for (dt in unique(callput_data$date)){
  
  # Extracting the data frame
  tmp_data = callput_data[callput_data$date==dt,]
  
  # Interpolating
  remove(interpolation)
  try(interpolation <- approx(tmp_data$rate_med, xout = 1:NROW(tmp_data))$y, silent=T)
  if (exists('interpolation')==F){
    interpolation = tmp_data$rate_med
  }
  
  # Storing interpolation
  rate_med_fill = c(rate_med_fill, interpolation)
}

remove(dt, interpolation, tmp_data)

# Replacing rate_med
callput_data$rate_med = rate_med_fill

remove(rate_med_fill)

# Interpolating time series

# Re-ordering the data frame by maturity
callput_data$maturity = callput_data$exdate - callput_data$date
callput_data = callput_data[order(callput_data$maturity, callput_data$date),]
# callput_data = callput_data[order(callput_data$maturity, as.Date(callput_data$date, '%d/%m/%Y')),]

# Interpolating
rate_med_fill = c()
interpolation = NA
for (mty in unique(callput_data$maturity)){
  
  # Extracting the data frame
  tmp_data = callput_data[callput_data$maturity==mty,]
  
  # Interpolating
  remove(interpolation)
  try(interpolation <- approx(tmp_data$rate_med, xout = 1:NROW(tmp_data))$y, silent=T)
  if (exists('interpolation')==F){
    interpolation = tmp_data$rate_med
  }
  
  # Storing interpolation
  rate_med_fill = c(rate_med_fill, interpolation)
}

remove(mty, interpolation, tmp_data)

# Replacing rate_med
callput_data$rate_med = rate_med_fill

remove(rate_med_fill)

# Filling the blanks in level_2
callput_data$maturity = NULL
level_2 = merge(level_2, callput_data, by = c('date', 'exdate'), all.x = T)

# DECISION: Remove the options where we cannot impute rates
level_2 = level_2[!(is.na(level_2$rate_med)),]

# Setting the correct riskfree rate
level_2$riskfree = level_2$rate_med

remove(callput_data)

# Report the number of obs
print(paste0(deparse(substitute(level_2)),' nobs (after interpolating): ',NROW(level_2)))

#####################
# Unable to get IV ##
#####################

# Computing the implied volatilities from the implied rates
level_2$impvol_from_imprate = implied_vol(level_2)

# Removing data where impvol_from_imprate==NA
level_2 = level_2[!(is.na(level_2$impvol_from_imprate)),]

# Report the number of obs
print(paste0(deparse(substitute(level_2)),' nobs (after interpolating): ',NROW(level_2)))


###########################
# LEVEL 3 FILTERS #########
###########################

#####################
# IV filter #########
#####################

# Reordering the data: Date, maturity, strike
# level_3 = level_2[order(as.Date(level_2$date, '%d/%m/%Y'), as.Date(level_2$exdate, '%d/%m/%Y'), level_2$strike_price),]
level_3 = level_2[order(level_2$date, level_2$exdate, level_2$strike_price),]

# Creating the moneyness bins
level_3$moneyness_bins = cut(level_3$moneyness, seq(0.8,1.2,0.05), include.lowest = T, right = T)

# Creating the quadratic fit
level_3 = level_3 %>% group_by(cp_flag, date, exdate) %>% mutate(vol_fit = predict(lm(log(impvol_from_imprate)~strike_price+strike_price^2)))
level_3 = as.data.frame(level_3)

# Computing the distance and standard deviation per bin
level_3 = level_3 %>% group_by(moneyness_bins) %>% mutate(bin_stdev = sqrt(var(vol_fit - log(impvol_from_imprate))))

# Generating the confidence bands
level_3$vol_lower = mean(level_3$vol_fit - log(level_3$impvol_from_imprate)) - 2*level_3$bin_stdev
level_3$vol_upper = mean(level_3$vol_fit - log(level_3$impvol_from_imprate)) + 2*level_3$bin_stdev

# Computing the criteria to delete
keep_idx = !(level_3$vol_fit - log(level_3$impvol_from_imprate) >
               level_3$vol_upper | level_3$vol_fit - log(level_3$impvol_from_imprate) < level_3$vol_lower)

# Keeping relevant data
level_3 = level_3[keep_idx,]

# Report the number of obs
print(paste0(deparse(substitute(level_3)),' nobs: ',NROW(level_3)))

#####################
# Call-Put filter ###
#####################

# Computing the confidence interval
rate_stdev = sqrt(var(level_3$rate - level_3$riskfree, na.rm = T))
rate_mean = mean(level_3$rate - level_3$riskfree, na.rm = T)
rate_upper = rate_mean + 2*rate_stdev
rate_lower = rate_mean - 2*rate_stdev

# Computing the criteria to delete
keep_idx = !(level_3$rate - level_3$riskfree < rate_lower|level_3$rate - level_3$riskfree > rate_upper) & !is.na(level_3$rate)

# Keeping the relevant data
level_3 = level_3[keep_idx,]
level_3 = as.data.frame(level_3)

# Report the number of obs
print(paste0(deparse(substitute(level_3)),' nobs: ',NROW(level_3)))

remove(rate_stdev, rate_mean, rate_upper, rate_lower, keep_idx)

######################################
# GENERATING RETURNS (CJS, 2013) #####
######################################

###########################
# FINDING TRADE-OUT MID ###
###########################

# Removing clutter
keep_vars = c('date', 'exdate', 'strike_price', 'cp_flag',
              'best_bid', 'best_offer', 'delta', 'sp500',
              'riskfree', 'div_yield', 'moneyness', 'maturity',
              'rate', 'impvol_from_imprate')
options_panel = level_3[,keep_vars]
options_panel = as.data.frame(options_panel)

remove(keep_vars)

# Computing the mid_price
options_panel$mid = (options_panel$best_bid + options_panel$best_offer)/2

# Finding the next business date
unique_dates = unique(options$date)[order(unique(options$date))]
date_index = match(options_panel$date, unique_dates)
date_index = date_index + 1
unique_dates = c(unique_dates, NA)
dates = unique_dates[date_index]
options_panel$next_date = as.Date(dates)

remove(unique_dates, date_index, dates)

#####################
# From clean data ###
#####################

# Merging the previous mid price and re-ordering the data
tmp_data = options_panel[,c('date','exdate','strike_price', 'cp_flag', 'mid')]
names(tmp_data)[names(tmp_data)=='mid'] = 'next_mid'
options_panel = merge(options_panel, tmp_data, by.x = c('next_date', 'exdate', 'strike_price', 'cp_flag'),
                      by.y = c('date', 'exdate', 'strike_price', 'cp_flag'), all.x = T)
options_panel = options_panel[order(options_panel$date,
                                    options_panel$exdate,
                                    options_panel$strike_price),]
remove(tmp_data)

# Re-ordering the data frame
keep_vars = c('date', 'exdate', 'strike_price', 'cp_flag',
              'best_bid', 'best_offer', 'delta', 'sp500',
              'riskfree', 'div_yield', 'moneyness', 'maturity',
              'rate', 'impvol_from_imprate', 'mid', 'next_date', 'next_mid')
options_panel = options_panel[,keep_vars]

remove(keep_vars)

#####################
# From orig data ####
#####################

# Dropping variables where next_date == NA
options_panel = options_panel[!(is.na(options_panel$next_date)),]

# Finding prices in original data
tmp_data = options[,c('date','exdate','strike_price', 'cp_flag', 'best_bid', 'best_offer')]
tmp_data$next_mid_orig = (tmp_data$best_bid + tmp_data$best_offer)/2
tmp_data$best_bid = NULL
tmp_data$best_offer = NULL
options_panel = merge(options_panel, tmp_data, by.x = c('next_date', 'exdate', 'strike_price', 'cp_flag'),
                      by.y = c('date', 'exdate', 'strike_price', 'cp_flag'), all.x = T)
options_panel = options_panel[order(options_panel$date,
                                    options_panel$exdate,
                                    options_panel$strike_price),]

remove(tmp_data)

# Re-ordering the data frame
keep_vars = c('date', 'exdate', 'strike_price', 'cp_flag',
              'best_bid', 'best_offer', 'delta', 'sp500',
              'riskfree', 'div_yield', 'moneyness', 'maturity',
              'rate', 'impvol_from_imprate', 'mid', 'next_date', 'next_mid', 'next_mid_orig')
options_panel = options_panel[,keep_vars]

remove(keep_vars)

# Filling out the NAs in next_mid with next_mid_orig
options_panel$next_mid = ifelse(is.na(options_panel$next_mid), options_panel$next_mid_orig, options_panel$next_mid)
options_panel$next_mid_orig = NULL

#####################
# From eminent exp ##
#####################

# Computing the month-year ID
options_panel$monthyear = paste0(month(options_panel$date),
                                 year(options_panel$date))

# Creating a is_last==TRUE flag where quote is the last in the month
options_panel = options_panel %>% group_by(cp_flag, monthyear, exdate, strike_price) %>% mutate(is_last = (last(date)==date))
options_panel = as.data.frame(options_panel)

# Updating the next_date if close to expiration: Expiration is Saturday, date is Friday
options_panel$next_date = ifelse(is.na(options_panel$next_mid) & options_panel$maturity<(14/360) & options_panel$cp_flag=='C' & options_panel$is_last==T,
                                 options_panel$exdate - 1,
                                 ifelse(is.na(options_panel$next_mid) & options_panel$maturity<(14/360) & options_panel$cp_flag=='P' & options_panel$is_last==T,
                                        options_panel$exdate - 1, options_panel$next_date))
options_panel$next_date = as.Date(options_panel$next_date, origin = ymd('1970-01-01'))

# Search for the SP500 index in expiration
tmp_data = sp500
names(tmp_data)[names(tmp_data)=='sp500']='next_sp500'
options_panel = merge(options_panel, tmp_data, by.x = c('next_date'),
                      by.y = c('date'), all.x = T)
options_panel = options_panel[order(options_panel$date,
                                    options_panel$exdate,
                                    options_panel$strike_price),]

remove(tmp_data)

# Re-ordering the data frame
keep_vars = c('date', 'exdate', 'strike_price', 'cp_flag',
              'best_bid', 'best_offer', 'delta', 'sp500',
              'riskfree', 'div_yield', 'moneyness', 'maturity',
              'rate', 'impvol_from_imprate', 'mid', 'next_date',
              'next_mid', 'next_sp500', 'monthyear', 'is_last')
options_panel = options_panel[,keep_vars]

remove(keep_vars)

# Setting next_mid as expiration payoff if close to expiration
options_panel$next_mid = ifelse(is.na(options_panel$next_mid) & options_panel$maturity<(14/360) & options_panel$cp_flag=='C' & options_panel$is_last==T,
                                pmax(0,options_panel$next_sp500 - options_panel$strike_price),
                                ifelse(is.na(options_panel$next_mid) & options_panel$maturity<(14/360) & options_panel$cp_flag=='P' & options_panel$is_last==T,
                                       pmax(0, options_panel$strike_price - options_panel$next_sp500),
                                       options_panel$next_mid))

# Remove clutter
options_panel$next_sp500 = NULL

#####################
# From re-appear ####
#####################

# Creating an index of order by month for each asset
options_panel = options_panel %>% group_by(cp_flag, monthyear, exdate, strike_price) %>% mutate(order_idx = row_number())
options_panel = as.data.frame(options_panel)
options_panel$next_order_idx = options_panel$order_idx + 1

# Merge the date based on order_index
tmp_data = options_panel[,c('cp_flag', 'monthyear', 'exdate', 'strike_price', 'order_idx', 'date')]
names(tmp_data)[names(tmp_data)=='date']='reappear_date'
options_panel = merge(options_panel, tmp_data, by.x = c('cp_flag', 'monthyear', 'exdate', 'strike_price', 'next_order_idx'),
                      by.y = c('cp_flag', 'monthyear', 'exdate', 'strike_price', 'order_idx'), all.x = T)
options_panel = options_panel[order(options_panel$date,
                                    options_panel$exdate,
                                    options_panel$strike_price),]
remove(tmp_data)

# Re-ordering the data frame
keep_vars = c('date', 'exdate', 'strike_price', 'cp_flag',
              'best_bid', 'best_offer', 'delta', 'sp500',
              'riskfree', 'div_yield', 'moneyness', 'maturity',
              'rate', 'impvol_from_imprate', 'mid', 'next_date',
              'next_mid', 'monthyear', 'is_last', 'order_idx', 'next_order_idx', 'reappear_date')
options_panel = options_panel[,keep_vars]

remove(keep_vars)

# Merge the mid based on reappear_date
tmp_data = options_panel[,c('date', 'exdate', 'strike_price', 'cp_flag', 'mid')]
names(tmp_data)[names(tmp_data)=='mid'] = 'reappear_mid'
options_panel = merge(options_panel, tmp_data, by.x = c('reappear_date', 'exdate', 'strike_price', 'cp_flag'),
                      by.y = c('date', 'exdate', 'strike_price', 'cp_flag'), all.x = T)
options_panel = options_panel[order(options_panel$date,
                                    options_panel$exdate,
                                    options_panel$strike_price),]
remove(tmp_data)

# Re-ordering the data frame
keep_vars = c('date', 'exdate', 'strike_price', 'cp_flag',
              'best_bid', 'best_offer', 'delta', 'sp500',
              'riskfree', 'div_yield', 'moneyness', 'maturity',
              'rate', 'impvol_from_imprate', 'mid', 'next_date',
              'next_mid', 'monthyear', 'is_last', 'order_idx', 'next_order_idx', 'reappear_date', 'reappear_mid')
options_panel = options_panel[,keep_vars]

remove(keep_vars)

# Replacing next_date with reappear_date if next_mid==NA and is_last==F
options_panel$next_date = ifelse(is.na(options_panel$next_mid) & options_panel$is_last==F,
                                 options_panel$reappear_date,
                                 options_panel$next_date)
options_panel$next_date = as.Date(options_panel$next_date, origin = ymd('1970-01-01'))

# Filling out next_mid with reappear_mid if next_mid==NA and is_last==F
options_panel$next_mid = ifelse(is.na(options_panel$next_mid) & options_panel$is_last==F,
                                options_panel$reappear_mid,
                                options_panel$next_mid)

# Removing clutter
options_panel$reappear_date = NULL
options_panel$reappear_mid = NULL
options_panel$next_order_idx = NULL

#####################
# From vol surface ##
#####################

# Computing the volatility surface fit for next_date and cp_flag
options_panel = options_panel %>% group_by(cp_flag, next_date) %>% 
  mutate(vol_surf_fit = exp(predict(lm(log(impvol_from_imprate)~maturity+moneyness^2+maturity*moneyness))))
options_panel = as.data.frame(options_panel)

# Computing the option price from the volatility surface fit
options_panel$implied_mid = NA
options_panel[options_panel$cp_flag=='C', "implied_mid"] = bscall(s = options_panel[options_panel$cp_flag=='C',"sp500"],
                                                                  k = options_panel[options_panel$cp_flag=='C',"strike_price"],
                                                                  v = options_panel[options_panel$cp_flag=='C',"vol_surf_fit"],
                                                                  r = options_panel[options_panel$cp_flag=='C',"riskfree"],
                                                                  tt = options_panel[options_panel$cp_flag=='C',"maturity"],
                                                                  d = options_panel[options_panel$cp_flag=='C',"div_yield"])
options_panel[options_panel$cp_flag=='P', "implied_mid"] = bsput(s = options_panel[options_panel$cp_flag=='P',"sp500"],
                                                                 k = options_panel[options_panel$cp_flag=='P',"strike_price"],
                                                                 v = options_panel[options_panel$cp_flag=='P',"vol_surf_fit"],
                                                                 r = options_panel[options_panel$cp_flag=='P',"riskfree"],
                                                                 tt = options_panel[options_panel$cp_flag=='P',"maturity"],
                                                                 d = options_panel[options_panel$cp_flag=='P',"div_yield"])

# Replacing the next_mid that are NAs
options_panel$next_mid = ifelse(is.na(options_panel$next_mid), options_panel$implied_mid, options_panel$next_mid)

###############
options_panel_before_return <- options_panel
save(options_panel_before_return, file = paste0(script_path, 'OutputInterim/Options/options_panel_before_return.RData'))

###########################
# COMPUTING RETURNS #######
###########################

load_rmetrics_calendars(1996:2020)
options_panel$day_diff = bizdays(options_panel$date, options_panel$next_date,
                                 'Rmetrics/NYSE')

# Computing daily returns
options_panel$daily_ret = ((options_panel$next_mid/options_panel$mid)^(1/options_panel$day_diff)) - 1

###########################
# GET RATES FOR LEV ADJ ###
###########################

#####################
# Making rate daily #
#####################

# Computing daily rates for all other rates
options_panel$riskfree_daily_rate = (options_panel$riskfree+1)^(1/360)-1

###########################
# COMPUTING OPTION DELTA ##
###########################

# Computing delta for calls and puts
d1 = (log(1/options_panel$moneyness) + options_panel$maturity*(options_panel$riskfree - options_panel$div_yield + 0.5*options_panel$impvol_from_imprate^2))/
  (options_panel$impvol_from_imprate*sqrt(options_panel$maturity))
delta_call = pnorm(d1)
delta_put = pnorm(d1)-1

# Assigning the correct delta
options_panel$custom_delta = ifelse(options_panel$cp_flag=='C',delta_call, delta_put)

remove(d1, delta_call, delta_put)

###########################
# COMPUTING LEV ADJUST ####
###########################

# Computing the weights
options_panel$w = 1/(options_panel$custom_delta*(options_panel$sp500/options_panel$mid))

# Computing the returns
options_panel$levadj_ret = (1 - lev_adj + lev_adj * options_panel$w) * options_panel$daily_ret + (lev_adj - lev_adj * options_panel$w) * options_panel$riskfree_daily_rate

######################################
# GENERATING PORTFOLIOS (CJS, 2013) ##
######################################

port_weights = options_panel

###########################
# GENERATING WEIGHTS ######
###########################

# Creating the relevant parameter list
target_maturity = c(30, 60, 90)
target_moneyness = seq(0.9,1.1,0.025)
sigma = diag(c(0.0125**2, 10**2))

# Creating a data frame of portfolio weights
for (mat in target_maturity){
  for (mon in target_moneyness){
    
    # Naming the portfolio
    port_name = paste0('port_',mat,'_',mon)
    
    # Generating the function to be run by mutate (getting around mutate issues)
    # mutate_call = interp(~dmvnorm(cbind(moneyness, maturity*360), mean = c(mon, mat), sigma = sigma))
    mutate_call = interp(~dmvnorm(cbind(moneyness, maturity*360), mean = c(mon, mat), sigma = sigma))
    
    # Creating the portfolio weights
    port_weights = port_weights %>% group_by(date, cp_flag) %>% mutate_(.dots = setNames(list(mutate_call), port_name))
  }
}

# Computing the vertical panel
port_weights = reshape2::melt(port_weights, names(options_panel))

# Normalizing the weights to sum up to 1
port_weights = port_weights %>% group_by(date, cp_flag, variable) %>% mutate(norm_weight = value/sum(value))
port_weights = as.data.frame(port_weights)

###########################
# GENERATING PORTFOLIOS ###
###########################

# Computing the weighted return
port_weights$weighted_ret = port_weights$levadj_ret * port_weights$norm_weight

# Aggregating to obtain portfolio returns
port_returns = port_weights %>% group_by(date, cp_flag, variable) %>% summarize(port_ret = sum(weighted_ret))
port_returns = as.data.frame(port_returns)

# Ordering the dataset
port_returns = port_returns[order(port_returns$date,
                                  port_returns$cp_flag, port_returns$variable),]

# Computing the month-year ID
port_returns$monthyear = paste0(month(port_returns$date),
                                year(port_returns$date))

# Aggregating by month and re-ordering
port_returns = port_returns %>% group_by(monthyear, cp_flag, variable) %>% summarize(port_ret = sum(port_ret))
port_returns$year = substr(port_returns$monthyear, nchar(port_returns$monthyear) - (4-1), nchar(port_returns$monthyear))
port_returns$year = as.numeric(port_returns$year)
port_returns$month = substr(port_returns$monthyear, 1, nchar(port_returns$monthyear)-4)
port_returns$month = as.numeric(port_returns$month)
port_returns$date = paste0('01/',sprintf('%02d',port_returns$month),'/',port_returns$year)
port_returns$date = dmy(port_returns$date)
port_returns = port_returns[order(port_returns$cp_flag, port_returns$date,
                                  port_returns$variable),]
port_returns$variable = paste0(port_returns$cp_flag, '_', port_returns$variable)

# Removing clutter
port_returns = port_returns[,c('date', 'variable', 'port_ret')]

# Reshaping the dataset
port_returns = dcast(port_returns, date~variable)
port_returns = port_returns[order(port_returns$date),]

# Saving the dataset
if (lev_adj == 1) {
  temp_title <- 'full'
} else if (lev_adj == 0.95) {
  temp_title <- '95adj'
} else if (lev_adj == 0.9) {
  temp_title <- '90adj'
} else if (lev_adj == 0) {
  temp_title <- 'noadj'
}
save(port_returns, file = paste0(script_path, 'OutputInterim/Options/port_returns_', temp_title,'.RData'))

print(proc.time() - init_time)

######################################
# GENERATING 18 PORTFOLIOS: DIFFERENT FROM HKM
######################################

port_returns <- data.table(port_returns)
for (cp in c('C', 'P')) {
  for (maturity in c(30, 60, 90)) {
    port_returns[, paste(cp, 'port', maturity, 'low', sep = '_') := apply(port_returns[, paste(cp, 'port', maturity, c(0.9, 0.925, 0.95), sep = '_'), with = F],
                                                                          1, function(x) mean(x, na.rm = T))]    
    port_returns[, paste(cp, 'port', maturity, 'mid', sep = '_') := apply(port_returns[, paste(cp, 'port', maturity, c(0.975, 1, 1.025), sep = '_'), with = F],
                                                                          1, function(x) mean(x, na.rm = T))]    
    port_returns[, paste(cp, 'port', maturity, 'high', sep = '_') := apply(port_returns[, paste(cp, 'port', maturity, c(1.05, 1.075, 1.1), sep = '_'), with = F],
                                                                           1, function(x) mean(x, na.rm = T))]    
  }
}

port_returns_18 <- port_returns[, c('date', names(port_returns)[grepl('low', names(port_returns)) | grepl('mid', names(port_returns)) | grepl('high', names(port_returns))]), with = F]

# Storing the resulting portfolios
save(port_returns_18, file = paste0(script_path, 'OutputInterim/Options/port_returns_', temp_title, '_18.RData'))
